3 primaryTranscriptAnnotation (PTA)
The purpose of this analysis is to define primary transcripts
3.1 Data pre-process
#! /bin/sh
#SBATCH --job-name=mergebw.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 10
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=32G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o mergebw.sh_%j.out
#SBATCH -e mergebw.sh_%j.err
export PATH=$PATH:/home/FCAM/ssun/packages/://home/FCAM/ssun/scripts
module load ucsc_genome/2012.05.22
module load R/4.1.2
PRO_normalization -c /home/FCAM/ssun/Genome_pro/hg38.chrom.sizes
# PRO_normalization assumes the naming convention CELL_COND_TIME_ETC_rep<#>_<plus><minus>.bigWig for unpaired reads or
#CELL_COND_TIME_ETC_rep<#>_<plus><minus>_PE<1><2>.bigWig for paired reads,
#and assumes that you are running the script in the directory with your bigWigs
3.1.1 Input for PTA
get the merged bigWigs (minus and plus) for primaryTranscriptAnnotation (PE1 data)
#! /bin/sh
#SBATCH --job-name=mergebw2.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 10
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=32G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o mergebw2.sh_%j.out
#SBATCH -e mergebw2.sh_%j.err
export PATH=$PATH:/home/FCAM/ssun/packages/://home/FCAM/ssun/scripts
module load ucsc_genome/2012.05.22
reps=12
pair="_PE1"
name="MCF7_dTAGGATAClone522_30min"
chrSizes=/home/FCAM/ssun/Genome_pro/hg38.chrom.sizes
plusfiles=$(ls ${name}_*rep*_plus${pair}_scaled.bigWig)
bigWigMerge $plusfiles tmpPlus${pair}.bg
minusfiles=$(ls ${name}_*rep*_minus${pair}_scaled.bigWig)
bigWigMerge -threshold=-10000000000 $minusfiles tmpMinus${pair}.bg
scaleall=$(bc <<< "scale=4 ; 1.0 / $reps")
normalize_bedGraph.py -i tmpPlus${pair}.bg -s $scaleall -o ${name}_plus${pair}_scaled.bg
normalize_bedGraph.py -i tmpMinus${pair}.bg -s $scaleall -o ${name}_minus${pair}_scaled.bg
sort -k1,1 -k2,2n ${name}_plus${pair}_scaled.bg > ${name}_plus${pair}_scaled_sorted.bg
sort -k1,1 -k2,2n ${name}_minus${pair}_scaled.bg > ${name}_minus${pair}_scaled_sorted.bg
bedGraphToBigWig ${name}_plus${pair}_scaled_sorted.bg $chrSizes ${name}_plus${pair}_scaled.bigWig
bedGraphToBigWig ${name}_minus${pair}_scaled_sorted.bg $chrSizes ${name}_minus${pair}_scaled.bigWig
rm ${name}_plus${pair}_scaled.bg
rm ${name}_minus${pair}_scaled.bg
rm ${name}_plus${pair}_scaled_sorted.bg
rm ${name}_minus${pair}_scaled_sorted.bg
rm tmpPlus${pair}.bg
rm tmpMinus${pair}.bg
3.1.2 Input for TSS
get the merged plus and minus for TSS inference (PE2 data)
#! /bin/sh
#SBATCH --job-name=mergebw3.sh # name for job
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 10
#SBATCH -p general
#SBATCH --qos=general
#SBATCH --mem=32G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=ssun@uchc.edu
#SBATCH -o mergebw3.sh_%j.out
#SBATCH -e mergebw3.sh_%j.err
export PATH=$PATH:/home/FCAM/ssun/packages/://home/FCAM/ssun/scripts
module load ucsc_genome/2012.05.22
reps=12
pair="_PE2"
name="MCF7_dTAGGATAClone522_30min"
chrSizes=/home/FCAM/ssun/Genome_pro/hg38.chrom.sizes
plusfiles=$(ls ${name}_*rep*_plus${pair}_scaled.bigWig)
bigWigMerge $plusfiles tmpPlus${pair}.bg
minusfiles=$(ls ${name}_*rep*_minus${pair}_scaled.bigWig)
bigWigMerge -threshold=-10000000000 $minusfiles tmpMinus${pair}.bg
scaleall=$(bc <<< "scale=4 ; 1.0 / $reps")
normalize_bedGraph.py -i tmpPlus${pair}.bg -s $scaleall -o ${name}_plus${pair}_scaled.bg
normalize_bedGraph.py -i tmpMinus${pair}.bg -s $scaleall -o ${name}_minus${pair}_scaled.bg
sort -k1,1 -k2,2n ${name}_plus${pair}_scaled.bg > ${name}_plus${pair}_scaled_sorted.bg
sort -k1,1 -k2,2n ${name}_minus${pair}_scaled.bg > ${name}_minus${pair}_scaled_sorted.bg
bedGraphToBigWig ${name}_plus${pair}_scaled_sorted.bg $chrSizes ${name}_plus${pair}_scaled.bigWig
bedGraphToBigWig ${name}_minus${pair}_scaled_sorted.bg $chrSizes ${name}_minus${pair}_scaled.bigWig
rm ${name}_plus${pair}_scaled.bg
rm ${name}_minus${pair}_scaled.bg
rm ${name}_plus${pair}_scaled_sorted.bg
rm ${name}_minus${pair}_scaled_sorted.bg
rm tmpPlus${pair}.bg
rm tmpMinus${pair}.bg
3.2 Transcription start site (TSS) Inference analysis
TSS Annotations
# download the gencode.v42.basic.annotation.gtf.gz file from: https://www.gencodegenes.org/human/release_42.html
# content: Basic gene annotation; region: Chr; Download: GTF
#mkdir Annotation_v42
#cd Annotation_v42
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/gencode.v42.basic.annotation.gtf.gz
gunzip gencode.v42.basic.annotation.gtf.gz
grep 'transcript_type "protein_coding"' gencode.v42.basic.annotation.gtf | \
grep -v 'tag "readthrough_transcript"' | \
grep -v 'tag "RNA_Seq_supported_only"' | \
grep -v 'tag "RNA_Seq_supported_partial"' | \
awk '{if($3=="exon"){print $0}} ' | \
grep -w "exon_number 1" | \
cut -f1,4,5,7,9 | tr ";" "\t" | \
awk '{for(i=5;i<=NF;i++){if($i~/^gene_name/){a=$(i+1)}} print $1,$2,$3,a,"na",$4}' | \
tr " " "\t" | tr -d '"' | \
grep -v "\tENSG" > gencode.hg38.v42.basic.firstExon.latest.filtered.bed
TSS
# at the TSS working directory (where you saved the merged bigWig files)
module load R/4.1.2
R
# source TSSinference functions
source("/home/FCAM/ssun/scripts/TSSinference.R")
library(bigWig)
# prepare annotation file
gene.exon1 = parse.bed.exon1("/home/FCAM/ssun/Annotation_v42/gencode.hg38.v42.basic.firstExon.latest.filtered.bed")
# assign bigWig variables
# These two are the ones with control + experimental merged
bw.plus='MCF7_dTAGGATAClone522_30min_plus_PE2_scaled.bigWig'
bw.minus='MCF7_dTAGGATAClone522_30min_minus_PE2_scaled.bigWig'
#potential.tss = TSSinference(gene.exon1, bw.plus, bw.minus, densityFilter=TRUE)
potential.tss2 = TSSinference(gene.exon1, bw.minus, bw.plus, densityFilter=TRUE) # looking for upstream antisense TSSs
head(potential.tss2)
# chrom start end gene misc strand height up down
#1 chr19 58353491 58353492 A1BG A1BG - NA NA NA
#2 chr10 50885674 50885675 A1CF A1CF - NA NA NA
#3 chr12 9115918 9115919 A2M A2M - NA NA NA
#4 chr12 8822621 8822622 A2ML1 A2ML1 + NA NA NA
#7 chr3 138132389 138132390 A4GNT A4GNT - NA NA NA
#10 chr3 151814073 151814074 AADAC AADAC + NA NA NA
3.3 Primary Transcription Annotations
Annotations
#mkdir Annotation_v42
#cd Annotation_v42
#wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_42/gencode.v42.basic.annotation.gtf.gz
#gunzip gencode.v42.basic.annotation.gtf.gz
# get the coordinates of first exon for every protein-coding gene
grep 'transcript_type "protein_coding"' gencode.v42.basic.annotation.gtf | \
awk '{if($3=="exon"){print $0}}' | \
grep -w "exon_number 1" | \
cut -f1,4,5,7,9 | tr ";" "\t" | \
awk '{for(i=5;i<=NF;i++){if($i~/^gene_name/){a=$(i+1)}} print $1,$2,$3,a,"na",$4}' | \
tr " " "\t" | tr -d '"' > gencode.v42.firstExon.bed
# get all transcripts
grep 'transcript_type "protein_coding"' gencode.v42.basic.annotation.gtf | \
awk '{if($3=="transcript"){print $0}} ' | \
cut -f1,4,5,7,9 | tr ";" "\t" | \
awk '{for(i=5;i<=NF;i++){if($i~/^gene_name/){a=$(i+1)}} print $1,$2,$3,a,"na",$4}' | \
tr " " "\t" | tr -d '"' > gencode.v42.transcript.bed
PTA
1.Loading key packages and preprocessed Pro-seq data
#if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#BiocManager::install("NMF")
library(NMF)
library(dplyr)
library(bigWig)
#BiocManager::install("pracma")
library(pracma)
library(RColorBrewer)
# https://github.com/WarrenDavidAnderson/genomicsRpackage/tree/master/primaryTranscriptAnnotation
#library(devtools)
#install_github("WarrenDavidAnderson/genomicsRpackage/primaryTranscriptAnnotation")
#library(primaryTranscriptAnnotation)
### facing troubles while installing the primaryTranscriptAnnotation packages, doing the following instead:
# download genomicsRpackage-master.zip from https://github.com/WarrenDavidAnderson/genomicsRpackage
# unzip genomicsRpackage-master.zip
source("/home/FCAM/ssun/primaryTranscriptsAnnotation/genomicsRpackage-master/primaryTranscriptAnnotation/R/gene_ann.R")
source("/home/FCAM/ssun/primaryTranscriptsAnnotation/genomicsRpackage-master/primaryTranscriptAnnotation/R/map_tu.R")
# import data for all 1's exons, annotate, and remove duplicate transcripts
fname = "/home/FCAM/ssun/Annotation_v42/gencode.v42.firstExon.bed"
dat0 = read.table(fname,header=F,stringsAsFactors=F)
names(dat0) = c('chr', 'start', 'end', 'gene', 'xy', 'strand')
dat0 = unique(dat0)
gencode.firstExon = dat0
# import data for all transcripts, annotate, and remove duplicate transcripts
fname = "/home/FCAM/ssun/Annotation_v42/gencode.v42.transcript.bed"
dat0 = read.table(fname,header=F,stringsAsFactors=F)
names(dat0) = c('chr', 'start', 'end', 'gene', 'xy', 'strand')
dat0 = unique(dat0)
gencode.transcript = dat0
# chrom size
chrom.sizes = read.table("/home/FCAM/ssun/Genome_pro/hg38.chrom.sizes",stringsAsFactors=F,header=F)
names(chrom.sizes) = c("chr","size")
# merged plus
plus.file = "MCF7_dTAGGATAClone522_30min_plus_PE1_scaled.bigWig"
# merged minus
minus.file = "MCF7_dTAGGATAClone522_30min_minus_PE1_scaled.bigWig"
bw.plus = load.bigWig(plus.file)
bw.minus = load.bigWig(minus.file)
obtain gene annotations use get.largest.interval();
count reads for each transcripts use read.count.transcript().
# Get the largest interval for each gene, given multiple TSS and TTS annotations
largest.interval.bed = get.largest.interval(bed=gencode.transcript)
# Evaluate reads and compute densities for each transcript of each gene.
transcript.reads = read.count.transcript(bed=gencode.transcript, bw.plus=bw.plus, bw.minus=bw.minus)
# str(transcript.reads)
#List of 2
# $ counts : Named num [1:19650] 0 0 0 249 144 ...
# ..- attr(*, "names")= chr [1:19650] "OR4F5" "OR4F29" "OR4F16" "SAMD11" ...
# $ density: Named num [1:19650] 0 0 0 0.01207 0.00954 ...
# ..- attr(*, "names")= chr [1:19650] "OR4F5" "OR4F29" "OR4F16" "SAMD11" ...
- Evaluate count and density distribution
jpeg(file="count_density_distribution.jpeg")
par(mfrow=c(1,2))
hist(log(transcript.reads$density), breaks=200,
col="black",xlab="log read density",main="")
hist(log(transcript.reads$counts), breaks=200,
col="black",xlab="log read count",main="")
dev.off()
Figure 2: Evaluate count and density distribution
- based on above distribution, we want to filter out genes with low/no expression
choose threshold
den.cut = -8
cnt.cut = 0
ind.cut.den = which(log(transcript.reads$density) < den.cut)
ind.cut.cnt = which(log(transcript.reads$counts) < cnt.cut)
ind.cut = union(ind.cut.den, ind.cut.cnt)
jpeg(file="cut_count_density_distribution.jpeg")
par(mfrow=c(1,2))
hist(log(transcript.reads$density), breaks=200,
col="black",xlab="log read density",main="")
abline(v=den.cut, col="red")
hist(log(transcript.reads$counts), breaks=200,
col="black",xlab="log read count",main="")
abline(v=cnt.cut, col="red")
dev.off()
Figure 3: Evaluate count and density distribution 2
- remove genes with low/no expression
unexp = names(transcript.reads$counts)[ind.cut]
largest.interval.expr.bed = largest.interval.bed[!(largest.interval.bed$gene %in% unexp),]
#head(largest.interval.bed)
# chr start end gene xy strand
#57158 chr19 58345183 58353492 A1BG na -
#31276 chr10 50799409 50885675 A1CF na -
#37570 chr12 9067708 9115919 A2M na -
#37561 chr12 8822621 8876787 A2ML1 na +
#1408 chr1 33306766 33321098 A3GALT2 na -
#60649 chr22 42692121 42720870 A4GALT na -
- TSS identification and remove overlaps
In previous TSS analysis session, we have identified transcription start site (TSS).
# from TSSInference
tss.interval.bed = merge(potential.tss2, largest.interval.expr.bed, by.x="gene", by.y="gene")
tss.interval.bed$end.x[tss.interval.bed$strand.x == "+"] <- tss.interval.bed$end.y[tss.interval.bed$strand.x == "+"]
tss.interval.bed$start.x[tss.interval.bed$strand.x == "-"] <- tss.interval.bed$start.y[tss.interval.bed$strand.x == "-"]
tss.interval.bed= tss.interval.bed[,c(2:5, 7, 6)]
colnames(tss.interval.bed) = c('chr', 'start', 'end', 'gene', 'xy', 'strand')
tss.interval.bed$xy <- 0
# head(tss.interval.bed)
# chr start end gene xy strand
#1 chr19 58345183 58353492 A1BG 0 -
#2 chr1 33306766 33321142 A3GALT2 0 -
#3 chr22 42692121 42718965 A4GALT 0 -
#4 chr12 53307459 53321610 AAAS 0 -
#5 chr12 125065402 125143316 AACS 0 +
#6 chr4 170060222 170090202 AADAT 0 -
Now we want to address overlaps (if multiple genes’ occupying same coordinates) and filter them out from TSSs.
#remove gene overlaps with the highest density
overlap.data = gene.overlaps(bed = tss.interval.bed)
filtered.id.overlaps = remove.overlaps(bed=tss.interval.bed,
overlaps=overlap.data$cases,
transcripts=gencode.transcript,
bw.plus=bw.plus,
bw.minus=bw.minus,
by="den")
- TTS identification
TTS: triplex target DNA site that can selectively form triple-helix structure, and regulate gene expression.
find TTS search regions use get.end.intervals();
identifying TTS use get.TTS(), which will generate a bed output with both TSSs and TTSs.
# get TTS intervals
add.to.end = 100000
fraction.end = 0.2
dist.from.start = 50
bed.for.tts.eval = get.end.intervals(bed=filtered.id.overlaps,
add.to.end=add.to.end,
fraction.end=fraction.end,
dist.from.start=dist.from.start)
# identify gene ends
add.to.end = max(bed.for.tts.eval$xy)
knot.div = 40
pk.thresh = 0.05
bp.bin = 50
knot.thresh = 5
cnt.thresh = 5
tau.dist = 50000
frac.max = 1
frac.min = 0.3
#save.image('230817_GATA3_pro_TSS.Rdata')
inferred.coords = get.TTS(bed=bed.for.tts.eval, tss= filtered.id.overlaps,
bw.plus=bw.plus, bw.minus=bw.minus,
bp.bin=bp.bin, add.to.end=add.to.end,
pk.thresh=pk.thresh, knot.thresh=knot.thresh,
cnt.thresh=cnt.thresh, tau.dist=tau.dist,
frac.max=frac.max, frac.min=frac.min,
knot.div=knot.div) # this takes long time to run
final.coords = inferred.coords$bed
write.table(final.coords, file = "MCF7_GATA3_gene_annotations.bed",
sep = "\t", row.names=FALSE, col.names=FALSE, quote=FALSE)
#save.image('230817_GATA3_pro_TSS2.Rdata')